### Code for cleaning the patent description and pubmed article abstract text
### and using the cleaned text to train our fasttext model
## Note:
## Running this file requires a large amount of memory (at least 200 GB of RAM).

################################################################################
###                        Building Patent Text Data 
################################################################################
library(data.table)
library(dplyr)
library(tidyr)
library(stringr)
library(stringi)

setwd("YOUR WORKING DIRECTORY")

### Load the three patent text datasets. Merged together, they give the list
### of patent ids used in our regressions.

## Patent title & abstract, 2,500 version: keep the id and the summary text,
## renaming the text column so it stays distinguishable after merging.
tas_2500 <- as_tibble(
  fread("/analysis/clean_data/patent_data/patent_tas_2500.tsv", sep = "\t")
) %>%
  select(patent_id, text_2500 = brf_sum_text)

## Patent title & abstract, 10,000 version: same treatment as above.
tas_10000 <- as_tibble(
  fread("data_cleaning/intermediate_data/patent_tas_10000.tsv", sep = "\t")
) %>%
  select(patent_id, text_10000 = brf_sum_text)

## Patent title text from PatentView. The id is converted to numeric and rows
## whose id does not convert (NA) are dropped, to be consistent with the two
## datasets above.
title_only <- as_tibble(
  fread("/analysis/clean_data/patent_data/patent.tsv", sep = "\t", quote = "")
) %>%
  transmute(id = as.numeric(id), title) %>%
  filter(!is.na(id))

## Merge the three sources, keeping every patent present in the 2,500 table.
pat_tas <- left_join(tas_2500, tas_10000, by = "patent_id") %>%
  left_join(title_only, by = c("patent_id" = "id"))

### read in the huge patent detail description text files by year
# Read the detailed-description file of one year.
#   year: value pasted into the file name "./detail_desc_text_<year>.tsv"
# Returns a tibble with the columns patent_id and text.
read_pat_txt <- function(year) {
  yearly_file <- paste0("./detail_desc_text_", year, ".tsv")
  yearly_tbl <- as_tibble(fread(yearly_file, sep = "\t"))
  select(yearly_tbl, patent_id, text)
}

# Read the detailed-description file of one year from 2001 to 2004, where the
# text column is named detail_description_text instead of text.
# (since this data is too large, we are not including it in our replication package)
#   year: value pasted into the file name "./detail_desc_text_<year>.tsv"
# Returns a tibble with the columns patent_id and detail_description_text.
read_pat_txt_special <- function(year) {
  yearly_file <- paste0("./detail_desc_text_", year, ".tsv")
  yearly_tbl <- as_tibble(fread(yearly_file, sep = "\t"))
  select(yearly_tbl, patent_id, detail_description_text)
}

## Read the yearly files in groups and keep only the needed ids, because
## loading every year at once needs more RAM than is available.
# Clean one group of regular-format files.
#   start, end: first and last year of the group (every year in start:end is read)
# Returns the stacked rows whose numeric patent_id appears in pat_tas.
pat_dtl_desc1 <- function(start, end) {
  yearly_tables <- lapply(start:end, read_pat_txt)
  stacked <- as_tibble(bind_rows(yearly_tables))
  stacked <- mutate(stacked, patent_id = as.numeric(patent_id))
  # ids that do not convert to a number become NA and are dropped here
  filter(stacked, !is.na(patent_id), patent_id %in% pat_tas$patent_id)
}

# Same as pat_dtl_desc1, but reads the years with the special column name
# (detail_description_text) through read_pat_txt_special.
#   start, end: first and last year of the group (every year in start:end is read)
pat_dtl_desc1_special <- function(start, end) {
  yearly_tables <- lapply(start:end, read_pat_txt_special)
  stacked <- as_tibble(bind_rows(yearly_tables))
  stacked <- mutate(stacked, patent_id = as.numeric(patent_id))
  # ids that do not convert to a number become NA and are dropped here
  filter(stacked, !is.na(patent_id), patent_id %in% pat_tas$patent_id)
}

# Write each filtered group of years to its own chunk file. The years 1998,
# 2011 and 2015 are not covered by these ranges; 2001-2004 use the reader for
# the special column name.
fwrite(x = pat_dtl_desc1(start = 1976, end = 1985), file = "pat_dtl_desc1.csv")
fwrite(x = pat_dtl_desc1(start = 1986, end = 1990), file = "pat_dtl_desc2.csv")
fwrite(x = pat_dtl_desc1(start = 1991, end = 1995), file = "pat_dtl_desc3.csv")
fwrite(x = pat_dtl_desc1(start = 1996, end = 1997), file = "pat_dtl_desc4.csv")
fwrite(x = pat_dtl_desc1(start = 1999, end = 2000), file = "pat_dtl_desc5.csv")
fwrite(
  x = pat_dtl_desc1_special(start = 2001, end = 2004),
  file = "pat_dtl_desc6.csv"
)
fwrite(x = pat_dtl_desc1(start = 2005, end = 2010), file = "pat_dtl_desc7.csv")
fwrite(x = pat_dtl_desc1(start = 2012, end = 2014), file = "pat_dtl_desc8.csv")


## 3 years (1998, 2011, and 2015) are unreadable by fread, so they are read
## with read_tsv from readr instead.
# (since this data is too large, we are not including it in our replication package)
# Read the detailed-description file of one year with readr.
#   year: value pasted into the file name "./detail_desc_text_<year>.tsv"
# Returns a tibble with the columns patent_id and text.
# readr is not attached with library() in this script, so its functions are
# called with an explicit readr:: prefix; otherwise read_tsv() and cols()
# cannot be found.
readr_pat_txt <- function(year) {
  yearly_file <- paste0("./detail_desc_text_", year, ".tsv")
  readr::read_tsv(
    yearly_file,
    # fix the column types up front; patent_id stays character here and is
    # converted to numeric by the caller
    col_types = readr::cols(
      uuid = readr::col_character(),
      patent_id = readr::col_character(),
      text = readr::col_character(),
      length = readr::col_double()
    )
  ) %>%
    as_tibble() %>%
    select(patent_id, text)
}

# Read the given years with readr_pat_txt, stack them, and keep only the rows
# whose numeric patent_id appears in pat_tas.
#   range: vector of years to read
pat_dtl_desc_readr <- function(range) {
  yearly_tables <- lapply(range, readr_pat_txt)
  stacked <- as_tibble(bind_rows(yearly_tables))
  stacked <- mutate(stacked, patent_id = as.numeric(patent_id))
  # ids that do not convert to a number become NA and are dropped here
  filter(stacked, !is.na(patent_id), patent_id %in% pat_tas$patent_id)
}

# Chunk 9: the three years that are read with readr
fwrite(
  x = pat_dtl_desc_readr(range = c(1998, 2011, 2015)),
  file = "pat_dtl_desc9.csv"
)

## Read the filtered chunk files back in so they can be compiled together.
# (since this data is too large, we are not including it in our replication package)
# Read one filtered chunk file.
#   file_num: chunk number pasted into the file name "./pat_dtl_desc<file_num>.csv"
# Returns a tibble whose patent_id column is converted to integer.
read_cleaned <- function(file_num) {
  chunk_file <- paste0("./pat_dtl_desc", file_num, ".csv")
  chunk_tbl <- as_tibble(fread(chunk_file))
  mutate(chunk_tbl, patent_id = as.integer(patent_id))
}

## Stack the nine chunk files
pat_dtl_desc <- as_tibble(bind_rows(lapply(1:9, read_cleaned)))

# Fix the special column name issue: where text is missing, take the value
# from detail_description_text, so that the final dataset only needs the id
# and text columns.
pat_dtl_desc <- pat_dtl_desc %>%
  mutate(text = ifelse(is.na(text), detail_description_text, text))

# Patent documents are tagged with type = 1 and the id becomes doc_id
text_pat <- pat_dtl_desc %>%
  select(-detail_description_text) %>%
  mutate(type = 1) %>%
  rename(doc_id = patent_id)

################################################################################
###                        Building PubMed Text Data 
################################################################################

### Read in the PubMed article data
## 1976-2020 article meta data extracted from the xmls
meta <- as_tibble(fread("data_cleaning/intermediate_data/all_article_meta.csv"))

## Drop every pmid that occurs more than once (all copies, not just the
## extras). We are not sure why duplicates are generated, but they are only
## 1,356 out of 30,419,056 pmids.
meta <- meta %>%
  filter(!duplicated(pmid) & !duplicated(pmid, fromLast = TRUE))

## Abstracts for all articles that have one.
# pmid has to be selected together with abstract: the previous select(abstract)
# dropped pmid, so the rename(doc_id = pmid) below failed on a missing column.
# PubMed documents are tagged with type = 2 and the id becomes doc_id.
abs_pubmed <- meta %>%
  select(pmid, abstract) %>%
  filter(abstract != "") %>%
  mutate(type = 2) %>%
  rename(doc_id = pmid)

################################################################################
###                  Merge the Two Text Datasets and Clean 
################################################################################

### build the input text data for fasttext
## first lightly clean the strings
# Text cleaning function.
#   string: character vector of raw document text
# Returns a character vector of the same length holding only letters and
# single spaces, with words that contain a lowercase letter lower-cased.
#
# NOTE(review): the non-letter filter used to be "[^[:alpha:]|^[:blank:]]".
# Inside a character class "|" and a non-leading "^" are literal characters,
# so that pattern kept "|" and "^" in the output (the ICU [:punct:] class used
# one step earlier does not appear to cover these two symbols - see the
# stringi character-class notes). The class is now "[^[:alpha:][:blank:]]".
# This changes the cleaned text for documents containing "|" or "^", so
# downstream files need to be regenerated.
text_clean <- function(string) {
  string %>%
    str_trim() %>%
    # drop every literal "'s"
    str_replace_all("'s", "") %>%
    # these four marks are removed without leaving a space behind
    str_replace_all("[,.:;]", "") %>%
    # any other punctuation becomes a space
    str_replace_all("[:punct:]", " ") %>%
    # remove everything that is neither a letter nor a blank
    str_replace_all("[^[:alpha:][:blank:]]", "") %>%
    # remove one-character words
    str_replace_all("\\b\\w{1}\\b", "") %>%
    # lower-case each word that contains at least one lowercase letter; words
    # without any lowercase letter are left as they are
    gsub(pattern = "(\\b\\w*[a-z]\\w*\\b)", replacement = "\\L\\1", perl = TRUE) %>%
    # collapse whitespace runs left by the removals above
    str_replace_all("\\s+", " ") %>%
    str_trim()
}

# Stack the patent text and the PubMed abstracts (abstract renamed to text so
# the columns line up), then split the rows into chunks using 1:800 as the
# grouping vector. This relies on that vector being recycled along the rows.
input_text <- split(
  as.data.table(bind_rows(text_pat, rename(abs_pubmed, text = abstract))),
  f = 1:800
)

### do the actual cleaning with parallel in LSF
# Now apply the future package to parallel the work
library(future)

## Original Source: https://github.com/hbs-rcs/sample_code/blob/master/R/R_parallel.R

## If using multiple cores on the grid is still too slow we can use multiple
## nodes on the compute grid. See https://grid.rcs.hbs.org/ for information.
## NOTE: this will only work on systems (like the HBS grid) with LSF available.

library(future.batchtools)
library(future.apply)

# Lift the size limit that future puts on exported globals
options(future.globals.maxSize = Inf)

# Clean the text column of one chunk with text_clean.
#   dat: a data.table with a text column; the column is overwritten by reference
# Returns the result of the := assignment on dat.
row_clean <- function(dat) {
  # the option is set again inside the function - presumably so that it also
  # holds in the R session that runs the chunk; confirm
  options(future.globals.maxSize = Inf)
  dat[, `:=`(text = text_clean(text))]
}

## Fetch the batchtools LSF template into the working directory
download.file(
  url = "https://raw.githubusercontent.com/mllg/batchtools/master/inst/templates/lsf-simple.tmpl",
  destfile = "lsf-simple.tmpl"
)

# Run futures as LSF jobs. walltime is given in seconds (50400 s = 14 hours).
plan(
  batchtools_lsf,
  template = "lsf-simple.tmpl",
  resources = list(walltime = 50400, memory = "40G", queue = "short"),
  workers = 100
)

## Same lapply-style code as on a local machine, but with the plan above each
## chunk is cleaned on LSF nodes instead of local CPU cores. The cleaned
## chunks are stacked back into one table.
pubmed_pat_text_cleaned <- input_text %>%
  future_lapply(row_clean) %>%
  rbindlist()

# Write only the cleaned text column out as the fasttext input, without quoting
fwrite(
  list(pubmed_pat_text_cleaned[["text"]]),
  "input_fasttext.txt",
  quote = FALSE
)

## Also keep the cleaned PubMed abstracts (type == 2) for future use: rows
## whose cleaned text is empty are dropped, the type column is removed and
## doc_id is renamed to PMID.
txt_cleaned_all <- pubmed_pat_text_cleaned %>%
  filter(type == 2, text != "") %>%
  select(-type) %>%
  rename(PMID = doc_id)

# (since this data is too large, we are not including it in our replication package)
fwrite(x = txt_cleaned_all, file = "txt_cleaned_all.csv")

################################################################################
###                        Training Fasttext Model 
################################################################################
## Code we run in fasttext in the shell
# ./fasttext skipgram -input ../input_fasttext.txt -output ../fasttext_model

## Thus, fasttext_model.bin is our trained model

################################################################################
###                    Clinical Trial Prediction Model 
################################################################################
### Read in the clinical trial data scraped from the xml files.
# Note: this replaces the PubMed `meta` object created earlier in the script.
meta <- as_tibble(fread("data_cleaning/intermediate_data/clinical_meta.csv"))

### variable cleaning
# Whitespace-only text cleaning function. From this point on it replaces the
# text_clean defined earlier in the script.
#   string: vector of raw field values
# Returns the values with carriage returns and newlines deleted, surrounding
# whitespace trimmed, and internal whitespace runs collapsed to one space.
text_clean <- function(string) {
  no_breaks <- str_replace_all(string, "[\r]|[\n]", "")
  trimmed <- str_trim(no_breaks)
  str_replace_all(trimmed, "\\s+", " ")
}

## Clean the whole clinical trial table
meta <- meta %>%
  # apply the whitespace cleaner to every column
  mutate_all(~text_clean(.)) %>%
  # columns 7 to 9 are reduced to their first run of four digits
  # (presumably the year of a date field - confirm the column positions)
  mutate_at(7:9, str_extract, pattern = "[[:digit:]]{4}") %>%
  # 0/1 indicators for the three trial_gender values
  mutate(
    if_male = ifelse(trial_gender == "Male", 1, 0),
    if_female = ifelse(trial_gender == "Female", 1, 0),
    if_all = ifelse(trial_gender == "All", 1, 0)
  ) %>%
  rename(subject_pool_criteria = eligibility_criteria)

# Text training data based on trial text: keep the trials whose start_date is
# 1994 or later.
# NOTE(review): start_date is text at this point, so the comparison with 1994
# is made on strings; text_train_trial is not referenced again in this file.
text_train_trial <- filter(meta, start_date >= 1994)

### Build out a cleaned version of the trial text corpus: missing values in
### the text fields are replaced by empty strings.
# NOTE(review): trial_text_raw is not created anywhere in this file - confirm
# where it comes from (text_train_trial above is the closest candidate).
trial_text <- trial_text_raw %>%
  mutate_at(
    vars(
      brief_title,
      official_title,
      brief_sum,
      detail_des,
      primary_outcome_measure,
      secondary_outcome_measure,
      primary_outcome_description,
      secondary_outcome_description,
      condition,
      keywords
    ),
    ~replace_na(., "")
  )

# One text per trial: the titles, brief summary, condition and keywords are
# joined with single spaces and passed through text_clean; only the trial id,
# its gender label and the combined text are kept.
text_for_fast <- trial_text %>%
  transmute(
    nct_id,
    trial_gender,
    text = text_clean(
      str_c(
        brief_title, official_title, brief_sum, condition, keywords,
        sep = " "
      )
    )
  )

## Female training and test data.
# "Male" trials are relabelled "All", and the label is put in front of the
# text with the "__label__" prefix.
supervised_data_female <- text_for_fast %>%
  transmute(
    nct_id,
    text = str_c(
      "__label__",
      ifelse(trial_gender == "Male", "All", trial_gender),
      " ",
      text
    )
  )

### Build the training and testing datasets for the female model
# Shuffle the rows reproducibly
set.seed(90403)
rows_female <- sample(nrow(supervised_data_female))
suprvised_random_female <- supervised_data_female[rows_female, ]

## Slice the shuffled data: the first 340,000 rows train, the rest test.
# The fixed split size only makes sense for a larger corpus, so fail loudly
# instead of indexing past the end of the data.
stopifnot(nrow(suprvised_random_female) > 340000)
text_train_female <- suprvised_random_female[1:340000, ]
text_test_female <-
  suprvised_random_female[340001:nrow(suprvised_random_female), ]

# Write only the labelled text, without a header row.
# readr is not attached with library() in this script, so write_csv is called
# with an explicit readr:: prefix; otherwise the function cannot be found.
# NOTE(review): write_csv may wrap a line containing commas or quotes in double
# quotes, which would put a quote in front of the "__label__" prefix - confirm
# the written files against the format fastText expects.
readr::write_csv(
  select(text_train_female, -nct_id),
  "../text_train_female.csv",
  col_names = FALSE
)
readr::write_csv(
  select(text_test_female, -nct_id),
  "../text_test_female.csv",
  col_names = FALSE
)

## Male training and test data.
# "Female" trials are relabelled "All", and the label is put in front of the
# text with the "__label__" prefix.
supervised_data_male <- text_for_fast %>%
  transmute(
    nct_id,
    text = str_c(
      "__label__",
      ifelse(trial_gender == "Female", "All", trial_gender),
      " ",
      text
    )
  )

# Shuffle the rows of the male dataset reproducibly
set.seed(12345)
rows_male <- sample(nrow(supervised_data_male))
suprvised_random_male <- supervised_data_male[rows_male, ]

## Slice the shuffled data: the first 340,000 rows train, the rest test.
# The fixed split size only makes sense for a larger corpus, so fail loudly
# instead of indexing past the end of the data.
stopifnot(nrow(suprvised_random_male) > 340000)
text_train_male <- suprvised_random_male[1:340000, ]
text_test_male <- suprvised_random_male[340001:nrow(suprvised_random_male), ]

# Write only the labelled text, without a header row.
# readr is not attached with library() in this script, so write_csv is called
# with an explicit readr:: prefix; otherwise the function cannot be found.
# NOTE(review): write_csv may wrap a line containing commas or quotes in double
# quotes, which would put a quote in front of the "__label__" prefix - confirm
# the written files against the format fastText expects.
readr::write_csv(
  select(text_train_male, -nct_id),
  "../text_train_male.csv",
  col_names = FALSE
)
readr::write_csv(
  select(text_test_male, -nct_id),
  "../text_test_male.csv",
  col_names = FALSE
)

### Prepare the patent text data (using the 2500-character text)
## Function used for cleaning up the patent and medical trial texts. From this
## point on it replaces the whitespace-only text_clean defined above.
#   string: character vector of raw document text
# Returns a character vector of the same length holding only letters and
# single spaces, with words that contain a lowercase letter lower-cased.
#
# NOTE(review): the non-letter filter used to be "[^[:alpha:]|^[:blank:]]".
# Inside a character class "|" and a non-leading "^" are literal characters,
# so that pattern kept "|" and "^" in the output (the ICU [:punct:] class used
# one step earlier does not appear to cover these two symbols - see the
# stringi character-class notes). The class is now "[^[:alpha:][:blank:]]".
# This changes the cleaned text for documents containing "|" or "^".
text_clean <- function(string) {
  string %>%
    str_trim() %>%
    # drop every literal "'s"
    str_replace_all("'s", "") %>%
    # these four marks are removed without leaving a space behind
    str_replace_all("[,.:;]", "") %>%
    # any other punctuation becomes a space
    str_replace_all("[:punct:]", " ") %>%
    # remove everything that is neither a letter nor a blank
    str_replace_all("[^[:alpha:][:blank:]]", "") %>%
    # remove one-character words
    str_replace_all("\\b\\w{1}\\b", "") %>%
    # lower-case each word that contains at least one lowercase letter; words
    # without any lowercase letter are left as they are
    gsub(pattern = "(\\b\\w*[a-z]\\w*\\b)",
         replacement = "\\L\\1", perl = TRUE) %>%
    # collapse whitespace runs left by the removals above
    str_replace_all("\\s+", " ") %>%
    str_trim()
}

## Cleaning the text is slow. The original note mentions loading a saved copy
## for speed, but no such load exists in this script: the text is recomputed.
# Title and 2500-character text are joined with a space and cleaned; the
# combined text is missing whenever either part is missing.
pat_text_clean <-
  pat_tas %>%
  mutate(
    text = str_c(title, text_2500, sep = " "),
    text = text_clean(text)
  ) %>%
  select(
    patent_id, title, text
  )

## Write out the patent data so we can generate predictions for each text.
# One line per row of pat_text_clean, in the same order, without a header.
# readr is not attached with library() in this script, so write_csv is called
# with an explicit readr:: prefix; otherwise the function cannot be found.
readr::write_csv(
  select(pat_text_clean, text),
  "../pat_text_for_prediction.csv",
  col_names = FALSE
)

## Code we run in fasttext in the shell
# We use the 100-dimensional model we trained above (fasttext_model.bin)
# ./fasttext supervised -input text_train_female.csv -output supervised_model_female -pretrainedVectors fasttext_model.vec
# ./fasttext test supervised_model_female.bin text_test_female.csv
# ./fasttext predict-prob supervised_model_female.bin pat_text_for_prediction.csv > pat_predicted_female_trial.csv
# 
# ./fasttext supervised -input text_train_male.csv -output supervised_model_male -pretrainedVectors fasttext_model.vec
# ./fasttext test supervised_model_male.bin text_test_male.csv
# ./fasttext predict-prob supervised_model_male.bin pat_text_for_prediction.csv > pat_predicted_male_trial.csv
